{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "train.csv 有6列：\n",
    "\n",
    "- user：用户ID\n",
    "- event：活动ID\n",
    "- invited：是否被邀请（0/1）\n",
    "- timestamp：ISO-8601 UTC格式时间字符串，表示用户看到该活动的时间\n",
    "- interested、not_interested：用户是否对该活动标记了感兴趣 / 不感兴趣（0/1）\n",
    "\n",
    "test.csv 没有 interested 和 not_interested 两列，其余列与 train.csv 相同"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 312,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn import metrics\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 313,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the labelled training interactions and the unlabelled test interactions.\n",
    "train = pd.read_csv(\"train.csv\")\n",
    "test = pd.read_csv(\"test.csv\")\n",
    "# Stack train on top of test. DataFrame.append was deprecated in pandas 1.4 and\n",
    "# removed in 2.0; pd.concat is the equivalent call. The original row indices are\n",
    "# kept, and the label columns that test lacks are filled with NaN.\n",
    "data = pd.concat([train, test], sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 314,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>event</th>\n",
       "      <th>invited</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>interested</th>\n",
       "      <th>not_interested</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1918771225</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1502284248</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3044012</td>\n",
       "      <td>2529072432</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3044012</td>\n",
       "      <td>3072478280</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1390707377</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user       event  invited                         timestamp  interested  \\\n",
       "0  3044012  1918771225        0  2012-10-02 15:53:05.754000+00:00         0.0   \n",
       "1  3044012  1502284248        0  2012-10-02 15:53:05.754000+00:00         0.0   \n",
       "2  3044012  2529072432        0  2012-10-02 15:53:05.754000+00:00         1.0   \n",
       "3  3044012  3072478280        0  2012-10-02 15:53:05.754000+00:00         0.0   \n",
       "4  3044012  1390707377        0  2012-10-02 15:53:05.754000+00:00         0.0   \n",
       "\n",
       "   not_interested  \n",
       "0             0.0  \n",
       "1             0.0  \n",
       "2             0.0  \n",
       "3             0.0  \n",
       "4             0.0  "
      ]
     },
     "execution_count": 314,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 25635 entries, 0 to 10236\n",
      "Data columns (total 6 columns):\n",
      "user              25635 non-null int64\n",
      "event             25635 non-null int64\n",
      "invited           25635 non-null int64\n",
      "timestamp         25635 non-null object\n",
      "interested        15398 non-null float64\n",
      "not_interested    15398 non-null float64\n",
      "dtypes: float64(2), int64(3), object(1)\n",
      "memory usage: 1.4+ MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 316,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>event</th>\n",
       "      <th>invited</th>\n",
       "      <th>interested</th>\n",
       "      <th>not_interested</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2.563500e+04</td>\n",
       "      <td>2.563500e+04</td>\n",
       "      <td>25635.000000</td>\n",
       "      <td>15398.000000</td>\n",
       "      <td>15398.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>2.179509e+09</td>\n",
       "      <td>2.065171e+09</td>\n",
       "      <td>0.041467</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.033381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.265159e+09</td>\n",
       "      <td>1.191545e+09</td>\n",
       "      <td>0.199371</td>\n",
       "      <td>0.443079</td>\n",
       "      <td>0.179635</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.776192e+06</td>\n",
       "      <td>1.040700e+05</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.063161e+09</td>\n",
       "      <td>1.060763e+09</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2.225155e+09</td>\n",
       "      <td>2.007279e+09</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>3.285425e+09</td>\n",
       "      <td>3.054301e+09</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>4.293103e+09</td>\n",
       "      <td>4.294677e+09</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               user         event       invited    interested  not_interested\n",
       "count  2.563500e+04  2.563500e+04  25635.000000  15398.000000    15398.000000\n",
       "mean   2.179509e+09  2.065171e+09      0.041467      0.268282        0.033381\n",
       "std    1.265159e+09  1.191545e+09      0.199371      0.443079        0.179635\n",
       "min    1.776192e+06  1.040700e+05      0.000000      0.000000        0.000000\n",
       "25%    1.063161e+09  1.060763e+09      0.000000      0.000000        0.000000\n",
       "50%    2.225155e+09  2.007279e+09      0.000000      0.000000        0.000000\n",
       "75%    3.285425e+09  3.054301e+09      0.000000      1.000000        0.000000\n",
       "max    4.293103e+09  4.294677e+09      1.000000      1.000000        1.000000"
      ]
     },
     "execution_count": 316,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 317,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25635, 6)"
      ]
     },
     "execution_count": 317,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 318,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user                  0\n",
       "event                 0\n",
       "invited               0\n",
       "timestamp             0\n",
       "interested        10237\n",
       "not_interested    10237\n",
       "dtype: int64"
      ]
     },
     "execution_count": 318,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 训练集和测试集中出现的不同event的数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 319,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13418"
      ]
     },
     "execution_count": 319,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data[\"event\"].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# events数据\n",
    "活动描述信息在events.csv文件：共110维特征\n",
    "\n",
    "前9列：event_id, user_id, start_time, city, state, zip, country, lat, lng\n",
    "\n",
    "- event_id：活动的id\n",
    "- user_id：创建活动的用户的id\n",
    "- city, state, zip, country：活动地点（如果知道的话）\n",
    "- lat, lng：floats（活动地点的纬度和经度）\n",
    "- start_time：字符串，ISO-8601 UTC time，表示活动开始时间\n",
    "\n",
    "后101列为词频：c_1, c_2, ..., c_100, c_other\n",
    "\n",
    "- c_N：活动描述中第N个常用词出现的次数\n",
    "- c_other：除了最常用的100个词之外的其余词出现的次数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 320,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the events.csv rows whose event_id occurs in the train/test data.\n",
    "# Only the header plus the first 80,000 data rows are scanned (performance cap).\n",
    "MAX_LINES = 80001\n",
    "# Build the lookup once: a set gives O(1) membership tests, instead of\n",
    "# rebuilding list(data[\"event\"]) and scanning it for every row.\n",
    "wanted_events = set(data[\"event\"])\n",
    "eventList = []\n",
    "columns = []  # column names, filled from the header line\n",
    "# The context manager closes the file even if an exception interrupts the loop.\n",
    "with open(\"events.csv\", \"r\") as file:\n",
    "    for i, read in enumerate(file, start=1):\n",
    "        line = read.split(\",\")\n",
    "        line[-1] = line[-1].strip(\"\\n\")  # remove the newline ending the last field\n",
    "        if i == 1:\n",
    "            columns = line  # the first line holds the column names\n",
    "        else:\n",
    "            try:\n",
    "                if int(line[0]) in wanted_events:\n",
    "                    eventList.append(line)\n",
    "            except ValueError:\n",
    "                # The first field is not an integer event_id: report and skip the row.\n",
    "                print(\"error\")\n",
    "        if i == MAX_LINES:\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 321,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the matched rows into a DataFrame, using the header line as column names.\n",
    "events = pd.DataFrame(eventList, columns=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 322,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z                          \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z                          \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z                          \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z                          \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z                          \n",
       "\n",
       "  lat lng c_1   ...   c_92 c_93 c_94 c_95 c_96 c_97 c_98 c_99 c_100 c_other  \n",
       "0           2   ...      0    1    0    0    0    0    0    0     0       9  \n",
       "1           2   ...      0    0    0    0    0    0    0    0     0       7  \n",
       "2           0   ...      0    0    0    0    0    0    0    0     0      12  \n",
       "3           1   ...      0    0    0    0    0    0    0    0     0       8  \n",
       "4           1   ...      0    0    0    0    0    0    0    0     0       9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 322,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 323,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the columns that are not used for clustering: the ids, the start time,\n",
    "# the location fields and the catch-all c_other count.\n",
    "dropColumns = [\n",
    "    \"event_id\", \"user_id\", \"start_time\",\n",
    "    \"city\", \"state\", \"zip\", \"country\", \"lat\", \"lng\",\n",
    "    \"c_other\",\n",
    "]\n",
    "events = events.drop(columns=dropColumns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 324,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c_1</th>\n",
       "      <th>c_2</th>\n",
       "      <th>c_3</th>\n",
       "      <th>c_4</th>\n",
       "      <th>c_5</th>\n",
       "      <th>c_6</th>\n",
       "      <th>c_7</th>\n",
       "      <th>c_8</th>\n",
       "      <th>c_9</th>\n",
       "      <th>c_10</th>\n",
       "      <th>...</th>\n",
       "      <th>c_91</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 100 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   c_1  c_2  c_3  c_4  c_5  c_6  c_7  c_8  c_9  c_10  ...    c_91  c_92  c_93  \\\n",
       "0    2    0    2    0    0    0    0    0    0     0  ...       0     0     1   \n",
       "1    2    0    2    0    0    0    0    0    0     0  ...       0     0     0   \n",
       "2    0    0    0    0    0    0    0    0    0     0  ...       0     0     0   \n",
       "3    1    0    2    1    0    0    0    0    0     0  ...       0     0     0   \n",
       "4    1    1    0    0    0    0    0    2    0     0  ...       0     0     0   \n",
       "\n",
       "   c_94  c_95  c_96  c_97  c_98  c_99  c_100  \n",
       "0     0     0     0     0     0     0      0  \n",
       "1     0     0     0     0     0     0      0  \n",
       "2     0     0     0     0     0     0      0  \n",
       "3     0     0     0     0     0     0      0  \n",
       "4     0     0     0     0     0     0      0  \n",
       "\n",
       "[5 rows x 100 columns]"
      ]
     },
     "execution_count": 324,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events = events.astype(int)\n",
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "metadata": {},
   "outputs": [],
   "source": [
    "def K_cluster_analysis(K, events):\n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    mb_kmeans.fit(events)\n",
    "    result = mb_kmeans.predict(events)\n",
    "    CH_score = metrics.silhouette_score(events,result)\n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 326,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: -0.05450313524371254\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.13588270391183052\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.1381873300633514\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.1327555544332936\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.155193824427899\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.1683235885520923\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.133401610480347\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.1402066922139718\n",
      "K-means begin with clusters: 90\n",
      "CH_score: 0.15684085063267525\n"
     ]
    }
   ],
   "source": [
    "# Search range for the hyperparameter K (number of clusters): 10, 20, ..., 90\n",
    "Ks = range(10,100,10)\n",
    "# One score per K, in the same order as Ks. The values are whatever\n",
    "# K_cluster_analysis returns (it computes them with metrics.silhouette_score).\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, events)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 327,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x7ff523c9cb38>]"
      ]
     },
     "execution_count": 327,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYYAAAD8CAYAAABzTgP2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAHl9JREFUeJzt3X28lHWd//HXB/AgEAoIGgl4joo3pAh6BNLSylJ0TTJ1w9rCpKzW2tr62drmTenD3ezRmr92rbTAbrbWSi1J3Yg029YZbg4IBCJ6UpAjIiCSKAoczuf3x+c6P2aOA5ybOXPNzfv5eJzHzFxzDfNh5pzrPdfne32vMXdHRESkXZ+0CxARkfKiYBARkTwKBhERyaNgEBGRPAoGERHJo2AQEZE8CgYREcmjYBARkTwKBhERydMv7QK6Y/jw4V5fX592GSIiFWXx4sWb3X3E/taryGCor6+nqakp7TJERCqKma3tzHpqJYmISB4Fg4iI5FEwiIhIHgWDiIjkUTCIiEgeBYOIiORRMIiISJ6KnMcgIp2zdSv89rdQVwcXXghmaVcklUDBIFJl1qyB3/wG7rsP/vhHaG2N5ZddBt/9Lhx4YJrVSSVQMIhUOHdYvBjmzIkwWL48lh9/PHzxi3DBBTB3LtxwA6xaBb/6FYwcmW7NUt4UDCIVaMcOePjhCIM5c2D9eujTB97+dvjmNyMMxo7ds/5pp8H48TBjBjQ2RjhMmpRe/VLeFAwiFeLFF+HBB2OvYO5ceOUVGDQIpk6NIDjvPBg+fO+Pv+iiCItp0+CMM+COO+CjHy1d/VI5FAwiZay5eU+L6H//F9raog304Q/HBv5d7+ramMH48bBoEfzt38bew7JlcPPN0E9bAsmhXweRMtLWBgsW7AmDVati+fjx8M//HGFw8snRNuqu4cNjj+MLX4BbboEVK+Cuu2Do0OL8H6TyKRhEUrZ9O/z+9xEGv/kNbNwYn+DPPBM+9Sl43/ugoaG4z3nAAfDv/w4nnQR///cx3nDffTBuXHGfRyqTgkEkBS+8APffH2Ewbx689hocdBCce27sFUydWppP8B//eBy9dNFFMGUK/PSnEURS2xQMIiXgDk88EZ/K58yB+fNj2ZgxMHPmngHhurrS13b66THucOGFUceNN0bbSpPhapeCQaSXtLZCJrNnvKC5OZafcgp89auxER4/vjw2wKNHw5/+FHsQ11wTg9J33hlHPUl52LABHnkEpk/v/edSMIgU0bZt8LvfRRg88EAcYlpXB+9+dwz2vu99MGpU2lUWNmAA/Od/woQJ8E//BE89Bb/+NRxxRNqV1a7W1jilyQ9+EK3HtrYYe+rtCYoKBpEiaG6OvYBf/hJ27ozxgb/5m9grOPvsGD+oBGZw1VVwwglw6aUxGe6ee6LNJaXT3AyzZ8MPfwjPPw+HHhofLC6/vDSz1hUMIj3Q0hI9+VmzYs/gk5+MgdzTT6/suQHnnhuHzU6bBmedBd/+Nnz602lXVd1eey1CeNasaBn16RPvw8yZcP75cSRZqRTltNtmNtXMVptZs5ldXeD+M8xsiZm1mtnFHe7bbWZLk585xahHpLdt2hTnITr66OjFf/rT8Je/xAb0zDMrOxTaHXtshMPZZ8chrZ/6VOwNSXEtWQJXXhl7Ah/5CKxbBzfdBM8+G+2jCy8sbShAEfYYzKwvcBvwXqAFWGRmc9z98ZzVngUuA/5PgX/iNXef0NM6RErhr3+NSWG33BLzDz76Ubj+eqivT7uy3nHwwTFecs018PWvw+OPw913R2tDuu+ll+LQ4FmzYOnSmL1+0UWxd3DmmT2bwFgMxfhcMwlodvenAczsLmAa8P+Dwd3XJPe1FeH5REpu+3a47bbYOG7ZAhdfHGcrPf74tCvrfX37wr/+axxBdfnlMe5w330wcWLalVWWtrZoEc2aFS2jHTviNfyP/4APfai8Zp4XI5cO
B9bl3G5JlnXWgWbWZGbzzez9e1vJzK5I1mvatGlTd2sV6ZKdO+E734mW0Ze+BJMnxymuf/nL2giFXJdeGudrco8xlJ//PO2KKsNzz0VraOzYGK954IE4LHjJkj1tpHIKBShOMBQ6Ctu78Pgx7t4IfAi41cyOKrSSu9/h7o3u3jhixIju1CnSabt3w49/HH32K6+Eo46C//mfOLvpySenXV16TjkFmpriNZg+PSbC7d6ddlXlZ9cuuPfeODJtzJhoxR1xRBwO/PzzsZdQzntcxQiGFmB0zu1RwPrOPtjd1yeXTwOPAGX8ckm1c48/6PbvLhg6FP77vyMU3vGOtKsrD4cdFt8F8YlPRItp2rQYe5GY3X7VVTFX5aKLYvzg6qvj8NOHH46z4g4YkHaV+1eMYFgEjDWzBjOrA6YDnTq6yMyGmln/5Ppw4HRyxiZESsU9JqZNmhR/0G1t0S5qaorzFpXD7ORyUlcHt98e4y5z58Z5lp56Ku2q0vHKK3Fk2tvfHu3FW2+NL0a6/35YuzbaSEcV7IOUrx4Hg7u3Ap8B5gKrgF+4+0ozu8HMLgAws1PNrAW4BLjdzFYmDz8eaDKzZcAfgK93OJpJpNc9+mh8r8E558RhqHfeCX/+cwwwp310SDkzi8NY582L123SpAiJWuAeh/JecUUcZnr55fEafOMbMbflV7+KNlKlHrZs7l0ZDigPjY2N3tTUlHYZUuGWLo3e7wMPRHvkmmuiPdK/f9qVVZ41a6KltGJFfPHPF79YnXtZmzfDT34SRxatXAkDB8aXHs2cGQPy5f5/NrPFyZjuPlVonol035NPwnXXxVE1Q4ZEn/yzn9UJ43qivj5OGHjZZdFjX7Ysvjq0Evrp+7N7d3xfxqxZce6oXbti7+j222MAvlJOd9IVCgapGc8+G3MPfvjDmFB0zTXxyXbIkLQrqw6DBsEvfhE99WuvjYHYX/8aDu/KwetlZO3aaCvOnh2zkYcNi9bZzJlw4olpV9e7FAxS9V54IfYKvvvduP3Zz8KXv6zZu73BLAL3xBPh7/4uJsPdey+87W1pV7Zvra3wzDMRZqtWxR7C738f973nPfDNb0arrFbajAoGqVpbt8Yf9K23wuuvw8c+Fp9kx4xJu7LqN21afBnRtGnwznfC974Xr3/atm+H1atj498eAk88Ee3F3PNANTREu/FjH6vN044rGGqAO7z6Krz88p6fbds6d7t///gjqa/fc1lfDyNGlO9A26uvxsnsvvGNCIfp0+FrX4Njjkm7stry1rfCwoXwwQ/GUTtLl8K//VtpjtTZvDk2+rkBsGpVtIfa9ekDRx4Zh5iedx4cd1xcP+44tRcVDGVsx47ubcw73t62LcJhf/r3h8GDYzDtoIPi+ubNcSz/iy/mrztwYH5Y5IZGQ0NMDCt1cOzYEQOeN90U7aPzz49TYk/QKRpTM2xYTBC86qrYc1uxIsYhDjmk5/92W1ts6HM3/O3Xc39fBwyIjf1pp8X4QHsAjB1bO62hrlIwlIFvfzsmU3XcsO/atf/HmuVvyA86KM6IOXp0/rKO63S8PXjwvv9Itm2LQxLXrIlebO7lo4/GJ/Ncgwe/MSxyLw8+uHuvVSGtrXEI4Ve/GgPMZ54Zfe3TTivec0j39esH3/oWnHRSfF/FqafGSfg6O4D7+usxea5j+2f16vgOg3bDh8cG/wMfiMv2T/9jxmg+SldpHkPK3GMQdODA+GS7tw343jbqgwaVR0tn69b8sOgYIK+8kr/+0KF7D436enjTm/b/nG1tcZbKa6+NjURjI/zLv8RgYTm8JvJGCxbE9wu8/HKE+YUX7rnvpZfyN/ztewHPPBPvNcT7esQR+Rv+9svhw9P5P1WSzs5jUDCkrLk5dmlvvz1mUVYj9zhVdaG9jfbruZ/8IP7IC4VGQ0NsGB55BL7yFXjssehl33gjvP/9CoRKsH59BMLChTG7vH084IUX9qxTVxdjQh0D4Jhj4kOUdI8muFWITCYu
q7ntYRY95UMOibNzduQepxMotLexbFl8UcyOHW98XENDfOq89NL4zgCpDG95C/zxj3HY8H33xSnNzzsvPwQaGvSepknBkLJMJtpC48alXUl6zKKdduih8X0HHbW1wYYN+aExcmQcJ19XV+pqpRgOPBC+//34kfKjYEhZNhtnptTg2N716ROfMt/yluresxIpF9ocpejll+MsntrYiUg5UTCkaMGC6K+X++kCRKS2KBhSlM1Gf71QX11EJC0KhhRlMnDCCcWd7CUi0lMKhpS0tcVJxtRGEpFyo2BIyapV8QXqGngWkXKjYEhJLUxsE5HKpGBISSYTp304+ui0KxERyadgSEk2G+MLOrePiJQbBUMKNm+Os4GqjSQi5UjBkIL58+NSRySJSDlSMKQgm40zR556atqViIi8kYIhBZkMTJyo88qLSHlSMJRYa2t8QYnaSCJSrhQMJbZ8OWzfroFnESlfCoYS08Q2ESl3CoYSy2TiC2dGj067EhGRwhQMJZbNxt6CJraJSLlSMJTQ+vXxncVqI4lIOVMwlFA2G5c6IklEypmCoYSyWejfP+YwiIiUKwVDCWUy0NgY4SAiUq4UDCWyYwcsXqw2koiUv6IEg5lNNbPVZtZsZlcXuP8MM1tiZq1mdnGH+2aY2VPJz4xi1FOOliyBnTs18Cwi5a/HwWBmfYHbgHOBccClZjauw2rPApcBP+vw2GHA9cBkYBJwvZkN7WlN5ah9Ypv2GESk3BVjj2ES0OzuT7v7TuAuYFruCu6+xt2XA20dHnsOMM/dt7j7S8A8YGoRaio72Sw0NMCb35x2JSIi+1aMYDgcWJdzuyVZ1tuPrRju8OijaiOJSGUoRjAUmsPrxX6smV1hZk1m1rRp06ZOF1cO1q6FDRsUDCJSGYoRDC1A7pl/RgHri/1Yd7/D3RvdvXHEiBHdKjQtmtgmIpWkGMGwCBhrZg1mVgdMB+Z08rFzgbPNbGgy6Hx2sqyqZDIwaBCceGLalYiI7F+Pg8HdW4HPEBv0VcAv3H2lmd1gZhcAmNmpZtYCXALcbmYrk8duAW4kwmURcEOyrKpkMjB5MvTrl3YlIiL7V5RNlbs/CDzYYdl1OdcXEW2iQo+dDcwuRh3l6NVXYdkyuPoNsztERMqTZj73skWLYPduDTyLSOVQMPSy9oltU6akW4eISGcpGHpZNgvHHQfDhqVdiYhI5ygYepF77DGojSQilUTB0IuefBK2bFEwiEhlUTD0Ik1sE5FKpGDoRZkMDBkSYwwiIpVCwdCLMpnYW+ijV1lEKog2Wb1k61Z4/HG1kUSk8igYesmCBXFUkgaeRaTSKBh6SSYTLaRJk9KuRESkaxQMvSSbjbOpDh6cdiUiIl2jYOgFu3fD/PlqI4lIZVIw9IKVK2HbNgWDiFQmBUMv0MQ2EalkCoZekMnAoYfCkUemXYmISNcpGHpB+4nzzNKuRESk6xQMRbZpEzQ3q40kIpVLwVBk7eMLGngWkUqlYCiyTAYOOABOOSXtSkREukfBUGTZLEycCAMGpF2JiEj3KBiKaNcuWLhQbSQRqWwKhiJauhRef13BICKVTcFQRJrYJiLVQMFQRJkMjB4No0alXYmISPcpGIqofWKbiEglUzAUSUsLrFunNpKIVD4FQ5FoYpuIVAsFQ5FkMjF3YcKEtCsREekZBUORZLPQ2BiznkVEKpmCoQheew2WLFEbSUSqg4KhCBYvjlnPCgYRqQYKhiJoH3ieMiXdOkREikHBUASZDBx9dHxrm4hIpVMw9JC7JraJSHVRMPTQM8/Axo2a2CYi1aMowWBmU81stZk1m9nVBe7vb2Y/T+5fYGb1yfJ6M3vNzJYmP98rRj2llMnEpfYYRKRa9OvpP2BmfYHbgPcCLcAiM5vj7o/nrDYTeMndjzaz6cDNwAeT+/7i7hU7LSyTgcGD4a1vTbsSEZHiKMYewySg2d2fdvedwF3AtA7rTAN+lFy/GzjLzKwIz526bBYmT4a+fdOu
RESkOIoRDIcD63JutyTLCq7j7q3AX4FDkvsazOwxM/ujmb1jb09iZleYWZOZNW3atKkIZffctm2wfLnaSCJSXYoRDIU++Xsn13keGOPuE4EvAD8zs4MKPYm73+Huje7eOGLEiB4VXCwLF0Jbm4JBRKpLMYKhBRidc3sUsH5v65hZP+BgYIu773D3FwHcfTHwF+CYItRUEu0T2yZPTrcOEZFiKkYwLALGmlmDmdUB04E5HdaZA8xIrl8MPOzubmYjksFrzOxIYCzwdBFqKolMJgadhwxJuxIRkeLp8VFJ7t5qZp8B5gJ9gdnuvtLMbgCa3H0OMAv4iZk1A1uI8AA4A7jBzFqB3cCn3H1LT2sqhba22GO45JK0KxERKa4eBwOAuz8IPNhh2XU5118H3rAJdfd7gHuKUUOprV4NW7dqfEFEqo9mPndT+8Q2zXgWkWqjYOimTAaGDYNjKmaoXESkcxQM3ZTNRhupOqbpiYjsoWDohi1bYNUqtZFEpDopGLph/vy41MCziFQjBUM3ZLNxbqRTT027EhGR4lMwdEMmAyedBIMGpV2JiEjxKRi6qLUVFixQG0lEqpeCoYtWrIBXX1UwiEj1UjB0kSa2iUi1UzB0USYDI0fCEUekXYmISO9QMHSRJraJSLVTMHTBhg3w9NNqI4lIdVMwdEH7F/No4FlEqpmCoQuyWairg5NPTrsSEZHeo2DogkwGTjkF+vdPuxIRkd6jYOiknTuhqUltJBGpfgqGTnrsMdixQ8EgItVPwdBJmtgmIrVCwdBJmQzU18fkNhGRaqZg6AT3CAa1kUSkFigYOmHdOli/Xm0kEakNCoZOaB9f0B6DiNQCBUMnZLMwcCCMH592JSIivU/B0AmZDEyaBP36pV2JiEjvUzDsx/btsHSp2kgiUjsUDPvR1BRf56lgEJFaoWDYj/aB5ylT0q1DRKRUFAz7kc3CscfCIYekXYmISGkoGPZBE9tEpBYpGPahuRk2b9bENhGpLQqGfdA3tolILVIw7EMmAwcfDMcfn3YlIiKlo2DYh0wmjkbqo1dJRGqINnl78fLLsGKF2kgiUnuKEgxmNtXMVptZs5ldXeD+/mb28+T+BWZWn3Pfl5Plq83snGLUUwwLFsRRSQoGEak1PQ4GM+sL3AacC4wDLjWzcR1Wmwm85O5HA98Cbk4eOw6YDrwVmAp8J/n3UpfJgFmcI0lEpJYUY49hEtDs7k+7+07gLmBah3WmAT9Krt8NnGVmliy/y913uPszQHPy76Uum4UTT4SDDkq7EhGR0ipGMBwOrMu53ZIsK7iOu7cCfwUO6eRjS66tLYJBbSQRqUXFCAYrsMw7uU5nHhv/gNkVZtZkZk2bNm3qYold8/jjMfisiW0iUouKEQwtwOic26OA9Xtbx8z6AQcDWzr5WADc/Q53b3T3xhEjRhSh7L3TxDYRqWXFCIZFwFgzazCzOmIweU6HdeYAM5LrFwMPu7sny6cnRy01AGOBhUWoqUcyGRgxAo46Ku1KRERKr8ffSeburWb2GWAu0BeY7e4rzewGoMnd5wCzgJ+YWTOxpzA9eexKM/sF8DjQClzp7rt7WlNPZTLRRrJCjS4RkSpXlC+rdPcHgQc7LLsu5/rrwCV7eexNwE3FqKMYNm+GJ5+Eyy9PuxIRkXRo5nMH8+fHpcYXRKRWKRg6yGSgXz9obEy7EhGRdCgYOshmYeJEGDAg7UpERNKhYMixaxcsXKg2kojUNgVDjuXLYft2TWwTkdqmYMihiW0iIgqGPJkMjBoFo0fvf10RkWqlYMjRPrFNRKSWKRgS69fD2rVqI4mIKBgSGl8QEQkKhkQmAwceCBMmpF2JiEi6FAyJbDZmO9fVpV2JiEi6FAzA66/D4sVqI4mIgIIBgCVLYOdOHZEkIgIKBmDPwLOCQUREwQDEwPNRR8Fhh6VdiYhI+mo+GNw1sU1EJFfNB8PatbBhgwaeRUTa1XwwZDJxqWAQEQkKhgy86U1wwglpVyIi
Uh5qPhiyWZg8Gfr2TbsSEZHyUNPB8MorsGyZ2kgiIrlqOhgWLYLdu3VEkohIrpoOhvaJbVOmpFuHiEg5qelgyGRg3DgYOjTtSkREykfNBoN77DGojSQikq9mg+HJJ2HLFg08i4h0VLPBoIltIiKF1XQwDB0KxxyTdiUiIuWlZoOhfXyhT82+AiIihdXkZnHrVli5Um0kEZFCajIY5s+PSx2RJCLyRjUZDNlstJAmTUq7EhGR8lOTwZDJwEknxVlVRUQkX80Fw+7d0UpSG0lEpLAeBYOZDTOzeWb2VHJZ8OQSZjYjWecpM5uRs/wRM1ttZkuTn0N7Uk9nrFwZZ1XVwLOISGE93WO4GnjI3ccCDyW385jZMOB6YDIwCbi+Q4B82N0nJD8be1jPfmlim4jIvvU0GKYBP0qu/wh4f4F1zgHmufsWd38JmAdM7eHzdlsmA4cdBvX1aVUgIlLeehoMh7n78wDJZaFW0OHAupzbLcmydncmbaRrzcx6WM9+ZbOxt9D7zyQiUpn67W8FM/s98OYCd32lk89RaBPsyeWH3f05MxsM3AN8BPjxXuq4ArgCYMyYMZ186nwbN0JzM3zyk916uIhITdhvMLj7e/Z2n5m9YGYj3f15MxsJFBojaAHemXN7FPBI8m8/l1xuM7OfEWMQBYPB3e8A7gBobGz0QuvsT/sX8+iIJBGRvetpK2kO0H6U0QzgvgLrzAXONrOhyaDz2cBcM+tnZsMBzOwA4HxgRQ/r2adsFg44AE45pTefRUSksvU0GL4OvNfMngLem9zGzBrN7AcA7r4FuBFYlPzckCzrTwTEcmAp8Bzw/R7Ws0+ZTITCgQf25rOIiFS2/baS9sXdXwTOKrC8Cfh4zu3ZwOwO67wKlPSze2MjjB5dymcUEak8PQqGSnPLLWlXICJS/mrulBgiIrJvCgYREcmjYBARkTwKBhERyaNgEBGRPAoGERHJo2AQEZE8CgYREclj7t06H12qzGwTsLabDx8ObC5iOcWiurpGdXWN6uqaaq3rCHcfsb+VKjIYesLMmty9Me06OlJdXaO6ukZ1dU2t16VWkoiI5FEwiIhInloMhjvSLmAvVFfXqK6uUV1dU9N11dwYg4iI7Fst7jGIiMg+VHUwmNlsM9toZitylg0zs3lm9lRyObTENY02sz+Y2SozW2lmnyuTug40s4Vmtiyp62vJ8gYzW5DU9XMzqytlXTn19TWzx8zs/jKra42Z/dnMlppZU7Is1fcyqWGImd1tZk8kv2tvS7suMzs2eZ3af142s8+nXVdS2z8mv/crzOy/kr+H1H/HzOxzSU0rzezzybJef72qOhiAHwJTOyy7GnjI3ccCDyW3S6kV+KK7Hw9MAa40s3FlUNcO4N3ufhIwAZhqZlOAm4FvJXW9BMwscV3tPgesyrldLnUBvMvdJ+QcRpj2ewnwf4HfuvtxwEnEa5dqXe6+OnmdJhDf3rgd+FXadZnZ4cA/AI3ufgLQF5hOyr9jZnYC8AlgEvEenm9mYynF6+XuVf0D1AMrcm6vBkYm10cCq1Ou7z7i+7LLpi5gILAEmExMpumXLH8bMDeFekYlfwDvBu4HrBzqSp57DTC8w7JU30vgIOAZkjHEcqmrQy1nA4+WQ13A4cA6YBjxrZb3A+ek/TsGXAL8IOf2tcCXSvF6VfseQyGHufvzAMnloWkVYmb1wERgQTnUlbRrlgIbgXnAX4Ct7t6arNJC/BGV2q3EH0RbcvuQMqkLwIHfmdliM7siWZb2e3kksAm4M2m//cDMBpVBXbmmA/+VXE+1Lnd/Dvgm8CzwPPBXYDHp/46tAM4ws0PMbCBwHjCaErxetRgMZcHM3gTcA3ze3V9Oux4Ad9/tsZs/ith9Pb7QaqWsyczOBza6++LcxQVWTevwutPd/WTgXKIteEZKdeTqB5wMfNfdJwKvkk47q6CkV38B8Mu0awFIevTTgAbgLcAg4v3sqKS/Y+6+imhnzQN+CywjWtG9rhaD
4QUzGwmQXG4sdQFmdgARCj9193vLpa527r4VeIQYAxliZv2Su0YB60tczunABWa2BriLaCfdWgZ1AeDu65PLjUS/fBLpv5ctQIu7L0hu300ERdp1tTsXWOLuLyS3067rPcAz7r7J3XcB9wKnUQa/Y+4+y91PdvczgC3AU5Tg9arFYJgDzEiuzyB6/CVjZgbMAla5+y1lVNcIMxuSXB9A/LGsAv4AXJxWXe7+ZXcf5e71RPvhYXf/cNp1AZjZIDMb3H6d6JuvIOX30t03AOvM7Nhk0VnA42nXleNS9rSRIP26ngWmmNnA5O+z/fUqh9+xQ5PLMcAHiNet91+vUg6mlPoneRGfB3YRn6JmEv3ph4jkfQgYVuKa3k7ski4HliY/55VBXeOBx5K6VgDXJcuPBBYCzcSuf/8U3893AveXS11JDcuSn5XAV5Llqb6XSQ0TgKbk/fw1MLRM6hoIvAgcnLOsHOr6GvBE8rv/E6B/mfyO/YkIqWXAWaV6vTTzWURE8tRiK0lERPZBwSAiInkUDCIikkfBICIieRQMIiKSR8EgIiJ5FAwiIpJHwSAiInn+HwB+JBnuDfWxAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Draw the CH_scores values against Ks as a solid blue line.\n",
    "plt.plot(Ks, np.array(CH_scores), color='b', linestyle='-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 331,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',\n",
       "        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=60,\n",
       "        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,\n",
       "        verbose=0)"
      ]
     },
     "execution_count": 331,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit on the feature columns only: a later cell adds a \"cluster\" column to `events`,\n",
    "# and re-running this cell afterwards must not feed those old labels back in as a feature.\n",
    "feature_cols = events.columns.drop(\"cluster\", errors=\"ignore\")\n",
    "# A fixed random_state makes the cluster assignment reproducible across runs.\n",
    "mb_kmeans = MiniBatchKMeans(n_clusters=60, random_state=0)\n",
    "mb_kmeans.fit(events[feature_cols])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 332,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the clustering result. Predict on the feature columns only, so that re-running\n",
    "# this cell does not pass the previously added \"cluster\" column to the model.\n",
    "feature_cols = events.columns.drop(\"cluster\", errors=\"ignore\")\n",
    "events[\"cluster\"] = mb_kmeans.predict(events[feature_cols])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 333,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c_1</th>\n",
       "      <th>c_2</th>\n",
       "      <th>c_3</th>\n",
       "      <th>c_4</th>\n",
       "      <th>c_5</th>\n",
       "      <th>c_6</th>\n",
       "      <th>c_7</th>\n",
       "      <th>c_8</th>\n",
       "      <th>c_9</th>\n",
       "      <th>c_10</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>cluster</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   c_1  c_2  c_3  c_4  c_5  c_6  c_7  c_8  c_9  c_10   ...     c_92  c_93  \\\n",
       "0    2    0    2    0    0    0    0    0    0     0   ...        0     1   \n",
       "1    2    0    2    0    0    0    0    0    0     0   ...        0     0   \n",
       "2    0    0    0    0    0    0    0    0    0     0   ...        0     0   \n",
       "3    1    0    2    1    0    0    0    0    0     0   ...        0     0   \n",
       "4    1    1    0    0    0    0    0    2    0     0   ...        0     0   \n",
       "\n",
       "   c_94  c_95  c_96  c_97  c_98  c_99  c_100  cluster  \n",
       "0     0     0     0     0     0     0      0       44  \n",
       "1     0     0     0     0     0     0      0       44  \n",
       "2     0     0     0     0     0     0      0       20  \n",
       "3     0     0     0     0     0     0      0       44  \n",
       "4     0     0     0     0     0     0      0       37  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 333,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows, including the new \"cluster\" column.\n",
    "events.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从运行结果来看，聚类效果并不理想。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
